# =======================================
# Load required libraries
# =======================================
library(data.table)
library(dplyr)
library(ggplot2)
library(ggsci)
library(ggthemes)
library(ggpubr)
library(ggExtra)
library(RColorBrewer)
library(tidyr)
library(viridis)
library(gridExtra)
library(cowplot)
library(phytools)
library(ggtree)
library(ape)
library(patchwork)
# Fonts
library(extrafont)
library(fontcm)
#font_import(prompt = FALSE)
#font_install('fontcm')
# Summary statistics
library(doBy)
# =======================================
# Set working directory
# =======================================
setwd("~/work/sylvietta_PAR/results/")
# Avoid scientific notation in plots/tables
options(scipen = 999)
# =======================================
# Exon completeness dataset
# =======================================
# Load Sylvioidea exon data
PAR.genes.bed <- fread("datasets/allSp.PAR.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(PAR.genes.bed) <- c("contig", "contig_start", "contig_end", "Gene", "Trans", "exon_nr", "species")
PAR.genes.bed$species <- sub("_1EV02922", "", PAR.genes.bed$species)
PAR.genes.bed <- subset(PAR.genes.bed, c(species != "AlaRaz" & species != "GalMod"))
# Load Zebra Finch PAR data and merge
ZF.PAR <- fread("datasets/ZF.PAR.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(ZF.PAR) <- c("contig", "contig_start", "contig_end", "Gene", "Trans", "exon_nr", "species")
PAR.genes.bed <- rbind(PAR.genes.bed, ZF.PAR)
# Load Zebra Finch genomic positions (on chrZ)
ZF.PAR.genes.bed <- fread("datasets/ZF.PAR.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(ZF.PAR.genes.bed) <- c("chr", "chr_start", "chr_end", "Gene", "Trans", "exon_nr", "ZF")
# Merge Sylvioidea and ZF data by gene ID
PAR.genes.bed <- merge(PAR.genes.bed, ZF.PAR.genes.bed, by = c("Trans", "Gene", "exon_nr"))
# Calculate exon completeness (as % of reference length)
PAR.genes.bed$perc_found <- (PAR.genes.bed$contig_end - PAR.genes.bed$contig_start) /
(PAR.genes.bed$chr_end - PAR.genes.bed$chr_start)
# Add gene names
geneName <- fread("datasets/PAR_genes_geneID_transID_geneName.list", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(geneName) <- c("name", "Gene", "Trans")
PAR.genes.bed <- merge(PAR.genes.bed, geneName, by = c("Trans", "Gene"))
# =======================================
# Plot exon completeness
# =======================================
pdf("~/work/PAR/results/plots/allSp_exon_completeness.pdf", height = 10, width = 20)
ggplot(PAR.genes.bed, aes(y = species, x = exon_nr, fill = as.factor(round(perc_found, 1)))) +
geom_point(pch = 22, size = 3) +
facet_wrap(~name) +
theme_tufte() +
scale_fill_startrek()
dev.off()
# =======================================
# Z Chromosome Analyses
# =======================================
# Load gene positions for ZF chrZ
ZF.Z.genes.bed <- fread("datasets/ZF.Z.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(ZF.Z.genes.bed) <- c("chr", "chr_start", "chr_end", "Gene", "Trans", "exon_nr", "ZF")
# Start position for each gene (used to sort/annotate later)
ZF.Z.genes.startPos <- aggregate(chr_start ~ chr + Gene + Trans, data = ZF.Z.genes.bed, FUN = max)
# Get list of PAR genes from ZF data
ZF.PAR <- fread("datasets/ZF.PAR.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(ZF.PAR) <- c("contig", "contig_start", "contig_end", "Gene", "Trans", "exon_nr", "species")
PAR.genes <- unique(ZF.PAR$Gene)
# Load Sylvioidea data for all Z genes
allSp.Z.genes.bed <- fread("datasets/allSp.Z.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(allSp.Z.genes.bed) <- c("contig", "contig_start", "contig_end", "Gene", "Trans", "exon_nr", "species")
allSp.Z.genes.bed <- subset(allSp.Z.genes.bed, c(species != "AlaRaz" & species != "GalMod"))
# Load sample information for Sylvioidea species
samples.sex <- fread("datasets/samples_sex.tsv", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(samples.sex) <- c("indv", "species", "sex")
samples.sex <- subset(samples.sex, c(species != "AlaRaz" & species != "GalMod"))
# List of all Z-linked genes per species (used for filling in singleton matrix)
fillInMissing <- fread("datasets/Z.genes.allSp.list", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(fillInMissing) <- c("chr", "Gene", "Trans", "species")
fillInMissing <- merge(fillInMissing, ZF.Z.genes.startPos, by = c("chr", "Gene", "Trans"))
# =======================================
# Load gene trees for analysis
# =======================================
setwd("~/work/PAR/mafft_ZF/")
# Load trees
LMAN1 <- read.tree("LMAN1.realn.fasta.auto.gt.0.8.trim.treefile")
uncharacterized2 <- read.tree("uncharacterized2.realn.fasta.auto.gt.0.8.trim.treefile")
RAX <- read.tree("RAX.realn.fasta.auto.gt.0.8.trim.treefile")
GRP <- read.tree("GRP.realn.fasta.auto.gt.0.8.trim.treefile")
SEC11C <- read.tree("SEC11C.realn.fasta.auto.gt.0.8.trim.treefile")
ZNF532 <- read.tree("ZNF532.realn.fasta.auto.gt.0.8.trim.treefile")
MALT1 <- read.tree("MALT1.realn.fasta.auto.gt.0.8.trim.treefile")
ALPK2 <- read.tree("ALPK2.realn.fasta.auto.gt.0.8.trim.treefile")
uncharacterized3 <- read.tree("uncharacterized3.realn.fasta.auto.gt.0.8.selectseqs.trim.treefile")
NEDD4L <- read.tree("NEDD4L.realn.fasta.auto.gt.0.8.trim.treefile")
uncharacterized4 <- read.tree("uncharacterized4.realn.fasta.auto.gt.0.8.selectseqs.trim.treefile")
ATP8B1 <- read.tree("ATP8B1.realn.fasta.auto.gt.0.8.trim.treefile")
NARS1 <- read.tree("NARS1.realn.fasta.auto.gt.0.8.trim.treefile")
FECH <- read.tree("FECH.realn.fasta.auto.gt.0.8.trim.treefile")
ST8SIA3 <- read.tree("ST8SIA3.realn.fasta.auto.gt.0.8.trim.treefile")
uncharacterized6 <- read.tree("uncharacterized6.realn.fasta.auto.gt.0.8.trim.treefile")
WDR7 <- read.tree("WDR7.realn.fasta.auto.gt.0.8.trim.treefile")
# Quick check of one tree
plot(WDR7)
# =======================================
# Extract pairwise cophenetic distances
# =======================================
# Helper function to extract distances with gene label
extract_distances <- function(tree, gene_name) {
dist_df <- melt(cophenetic.phylo(tree))
dist_df$Gene <- gene_name
return(dist_df)
}
# Create distance data for each gene
dist.list <- list(
extract_distances(RAX, "RAX"),
extract_distances(ATP8B1, "ATP8B1"),
extract_distances(LMAN1, "LMAN1"),
extract_distances(uncharacterized2, "uncharacterized2"),
extract_distances(GRP, "GRP"),
extract_distances(SEC11C, "SEC11C"),
extract_distances(ZNF532, "ZNF532"),
extract_distances(MALT1, "MALT1"),
extract_distances(ALPK2, "ALPK2"),
extract_distances(uncharacterized3, "uncharacterized3"),
extract_distances(NEDD4L, "NEDD4L"),
extract_distances(uncharacterized4, "uncharacterized4"),
extract_distances(NARS1, "NARS1"),
extract_distances(ST8SIA3, "ST8SIA3"),
extract_distances(uncharacterized6, "uncharacterized6"),
extract_distances(WDR7, "WDR7"),
extract_distances(FECH, "FECH")
)
# Combine all into a single data frame
dist.data <- rbindlist(dist.list)
# Remove self-self comparisons
dist.data <- subset(dist.data, Var1 != Var2)
# Extract species names from sample IDs
dist.data <- dist.data %>%
separate(Var1, into = c("Species1", "Species1_sex"), sep = "_") %>%
separate(Var2, into = c("Species2", "Species2_sex"), sep = "_")
# =======================================
# Filter for intraspecific pairs
# =======================================
# Keep only comparisons between individuals of the same species
sp.comp.dist.data <- dist.data %>%
filter(Species1 == Species2) %>%
select(Species1, Gene, value)
# Rename and order species factor
sp.comp.dist.data$Species1 <- sub("HirDau", "CecDau", sp.comp.dist.data$Species1)
sp.comp.dist.data$Species1 <- sub("TurAlt", "ArgAlt", sp.comp.dist.data$Species1)
sp.comp.dist.data$Species1 <- factor(sp.comp.dist.data$Species1, levels = c(
"TaeGut", "FicAlb", "CetCet", "AegCau", "PhyCol", "PycBar", "SylAtr", "ArgAlt",
"CecDau", "AcrSch", "LocLus", "CisJun", "SylBra", "PanBia", "EreAlp", "AlaArv"
))
sp.comp.dist.data <- unique(sp.comp.dist.data)
# =======================================
# Plot heatmap of distances
# =======================================
pdf("~/work/PAR/results/plots/PAR_heatmap.pdf")
ggplot(sp.comp.dist.data, aes(Gene, Species1, fill = value)) +
geom_tile(color = "black") +
scale_fill_distiller(palette = "YlOrRd", direction = 1) +
theme_tufte() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, size = 14),
axis.text.y = element_text(size = 14)
) +
labs(title = "", x = "", y = "", fill = "")
dev.off()
# =======================================
# Annotate species with fusion type
# =======================================
sp.comp.dist.data$type <- "Z + 4A"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("AlaArv")] <- "Z + 4A + 3 + 5"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("EreAlp", "PanBia")] <- "Z + 4A + 3"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("CisJun")] <- "Z + 4A + 4"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("SylBra")] <- "Z + 4A + 8"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("TaeGut", "FicAlb")] <- "Z"
# =======================================
# Summary statistics by group
# =======================================
summary_stats <- summaryBy(value ~ type + Species1, data = sp.comp.dist.data,
keep.names = TRUE, FUN = c(median, sd, IQR))
print(summary_stats)
# =======================================
# Statistical comparison (vs Zebra finch)
# =======================================
reference_species <- "TaeGut"
ref_data <- filter(sp.comp.dist.data, Species1 == reference_species)
ref_data %>%
group_by(Species1) %>%
summarise(
n = sum(!is.na(value)),
median_value = median(value, na.rm = TRUE),
sd_value = sd(value, na.rm = TRUE),
iqr_value = IQR(value, na.rm = TRUE))
results <- sp.comp.dist.data %>%
filter(Species1 != reference_species) %>%
group_by(Species1) %>%
summarise(
n = sum(!is.na(value)),
median_value = median(value, na.rm = TRUE),
sd_value = sd(value, na.rm = TRUE),
iqr_value = IQR(value, na.rm = TRUE),
p_value = if (n() > 1) {
wilcox.test(value, ref_data$value)$p.value
} else {
NA
},
W_statistic = if (n() > 1) {
wilcox.test(value, ref_data$value)$statistic
} else {
NA
},
higher_than_ref = median(value, na.rm = TRUE) > median(ref_data$value, na.rm = TRUE)
) %>%
mutate(
bonferroni_p = p.adjust(p_value, method = "bonferroni"),
bonferroni_p_formatted = format(bonferroni_p, digits = 4, scientific = FALSE)
)
print(results)
print(results)
write.table(results, file = "~/work/PAR/results/tables/pairwise_distance_summary.tsv", sep = "\t", row.names = FALSE, quote = FALSE)
# =======================================
# Add gene order by position
# =======================================
# Get gene start positions (one per gene)
PAR.genes.bed.small <- PAR.genes.bed %>%
select(name, chr_start) %>%
distinct() %>%
group_by(name) %>%
summarise(chr_start = min(chr_start)) %>%
arrange(chr_start) %>%
mutate(order = row_number())
# Merge position/order into distance data
sp.comp.dist.data.pos <- merge(sp.comp.dist.data, PAR.genes.bed.small,
by.x = "Gene", by.y = "name")
sp.comp.dist.data.pos <- sp.comp.dist.data.pos %>%
arrange(order) %>%
mutate(rownr = row_number())
write.table(sp.comp.dist.data.pos, file = "~/work/PAR/results/tables/pairwise_distance_per_gene.tsv", sep = "\t", row.names = FALSE, quote = FALSE)
# =======================================
# Plot flipped PAR heatmap
# =======================================
setwd("~/work/PAR/results/plots")
pdf("PAR_heatmap_flip.pdf", height = 10, width = 18)
ggplot(
subset(sp.comp.dist.data.pos, Species1 != "GalMod"),
aes(Species1, reorder(Gene, order), fill = value)
) +
geom_tile(color = "black", size = 0.2) +
scale_fill_viridis(direction = -1, option = "viridis") +
theme_tufte(base_size = 16) +
theme(
axis.text.x = element_text(size = 24, angle = 90, hjust = 1),
axis.text.y = element_text(size = 24)
) +
labs(title = " ", x = "", y = "", fill = "")
dev.off()
# Embed fonts for publication (for journal compatibility)
embed_fonts("PAR_heatmap_flip.pdf", outfile = "PAR_heatmap_flip_font.pdf")
# =======================================
# Boxplot by species (with colors)
# =======================================
myColors <- ifelse(levels(sp.comp.dist.data$Species1)=="AlaArv" , rgb(163/255,213/255,91/255) ,
ifelse(levels(sp.comp.dist.data$Species1)=="CisJun", rgb(163/255,213/255,91/255),
ifelse(levels(sp.comp.dist.data$Species1)=="SylBra", rgb(163/255,213/255,91/255),
"gold" ) ))
pdf("pairwise_distance_fusion_types3_new.pdf", width = 10, height = 5)
boxplot(value ~ Species1, data = sp.comp.dist.data,
notch = FALSE,
col = myColors,
main = " ", xlab = " ")
dev.off()
# Embed for publication
embed_fonts("pairwise_distance_fusion_types3_new.pdf",
outfile = "pairwise_distance_fusion_types3_new_font.pdf")
# =======================================
# Fusion Category Comparisons
# =======================================
# Extra fusion group
extra.fusion <- c("AlaArv", "PanBia", "EreAlp", "SylBra", "CisJun")
extra.sc.sp.comp.dist.data.pos <- filter(sp.comp.dist.data.pos, Species1 %in% extra.fusion) %>%
mutate(type = "Extra fusion")
# No extra fusion group
no.extra.sc.sp.comp.dist.data.pos <- filter(sp.comp.dist.data.pos, !Species1 %in% extra.fusion) %>%
mutate(type = "No extra fusion")
# Sex-linked PAR group
sex_linked_species <- c("AlaArv", "SylBra", "CisJun")
sex_linked_PAR.sp.comp.dist.data.pos <- filter(sp.comp.dist.data.pos, Species1 %in% sex_linked_species) %>%
mutate(PAR_type = "Extra fusion")
# Autosomal PAR group
autosomal_PAR.sp.comp.dist.data.pos <- filter(sp.comp.dist.data.pos, !Species1 %in% sex_linked_species) %>%
mutate(PAR_type = "No extra fusion")
# =======================================
# Annotate species full names
# =======================================
sp.comp.dist.data.pos$species_name <- "null"
sp.comp.dist.data.pos <- sp.comp.dist.data.pos %>%
mutate(species_name = recode(Species1,
"TaeGut" = "Taeniopygia guttata (Group I: Z)",
"AlaArv" = "Alauda arvensis (Group VII: Z;4A,3,5)*",
"EreAlp" = "Eremophila alpestris (Group VI: Z;4A,3)",
"PanBia" = "Panurus biarmicus (Group V: Z;4A;3;5)",
"SylBra" = "Sylvietta brachyura (Group IV: Z;4A;8)*",
"AegCau" = "Aegithalos caudatus (Group II: Z;4A)",
"AcrSch" = "Acrocephalus schoenobaenus (Group II: Z;4A)",
"CetCet" = "Cettia cetti (Group II: Z;4A)",
"CisJun" = "Cisticola juncidis (Group III: Z;4A;4)*",
"PhyCol" = "Phylloscopus collybita (Group II: Z;4A)",
"CecDau" = "Cecropis daurica (Group II: Z;4A)",
"FicAlb" = "Ficedula albicollis (Group I: Z)",
"PycBar" = "Pycnonotus barbatus (Group II: Z;4A)",
"SylAtr" = "Sylvia atricapilla (Group II: Z;4A)",
"LocLus" = "Locustella luscinioides (Group II: Z;4A)",
"ArgAlt" = "Argya altirostris (Group II: Z;4A)"
))
# =======================================
# Faceted Spearman Correlation Plot
# =======================================
setwd("~/work/PAR/results")
# Prepare chromosome start position (Mb)
sp.comp.dist.data.pos$chr_start_mb <- sp.comp.dist.data.pos$chr_start / 1e6
# Sort unique species names
sp_name <- unique(sp.comp.dist.data.pos$species_name)
# Define color palette
cols <- viridis(17, alpha = 1, begin = 0, end = 1, option = "D")
# Helper function for axis formatting
scaleFUN <- function(x) sprintf("%.3f", x)
scaleFUN2 <- function(x) sprintf("%.1f", x)
cor_stats <- data.frame(
species = character(),
rho = numeric(),
p_value = numeric(),
N = integer(),
stringsAsFactors = FALSE
)
plot_list <- list()
for (i in seq_along(sp_name)) {
data_i <- filter(sp.comp.dist.data.pos, species_name == sp_name[i])
y_limit <- max(data_i$value, na.rm = TRUE) + 0.001
n_value <- sum(!is.na(data_i$value))
# Compute Spearman test
cor_test <- cor.test(data_i$chr_start_mb, data_i$value, method = "spearman", exact = FALSE)
#rho_n_label <- sprintf("ρ = %.2f; p = %.3g; N = %d", cor_test$estimate, cor_test$p.value, n_value)
rho_n_label <- bquote(italic(rho) == .(round(cor_test$estimate, 2)) * "; p = " * .(formatC(cor_test$p.value, format = "g", digits = 3)) * "; N = " * .(n_value))
# Determine whether to add regression line
reg_line <- if (cor_test$p.value < 0.05) "reg.line" else "none"
p <- ggscatter(
data_i, x = "chr_start_mb", y = "value",
size = 5, shape = 21,
fill = "#6A9F33", # slightly darker green for clarity
add = reg_line, conf.int = FALSE,
repel = TRUE,
title = sp_name[i],
xlab = "Z chromosome position (Mb)", ylab = "Female-male pairwise distance",
show.legend.text = FALSE, show.legend = FALSE,
cor.coef = FALSE
) +
annotate("text", x = 0.05, y = y_limit*0.9, label = rho_n_label,
size = 6, color = "black", fontface = "italic", family = "Helvetica", hjust = 0) +
scale_y_continuous(labels = scaleFUN) +
scale_x_continuous(labels = scaleFUN2) +
theme_minimal(base_family = "Helvetica") +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.text = element_text(size = 16, colour = "black"),
axis.title = element_text(size = 14),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.8)
)
cor_stats <- rbind(
cor_stats,
data.frame(
species = as.character(sp_name[i]),
rho = as.numeric(cor_test$estimate),
p_value = cor_test$p.value,
N = n_value,
stringsAsFactors = FALSE
)
)
plot_list[[i]] <- p
}
plot_list[[6]] <- plot_list[[6]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[7]] <- plot_list[[7]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())
plot_list[[1]] <- plot_list[[1]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())
plot_list[[9]] <- plot_list[[9]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[8]] <- plot_list[[8]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank()) # Hide the label
plot_list[[11]] <- plot_list[[11]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank()) # Hide the label
plot_list[[15]] <- plot_list[[15]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[13]] <- plot_list[[13]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank()) # Hide the label
plot_list[[12]] <- plot_list[[12]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank()) # Hide the label
plot_list[[5]] <- plot_list[[5]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[4]] <- plot_list[[4]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank()) # Hide the label
# Define each row
row1 <- plot_list[[6]] + plot_list[[7]] + plot_list[[1]] + plot_layout(nrow = 1, guides = "collect", tag_level = "keep")
row2 <- plot_list[[9]] + plot_list[[8]] + plot_list[[11]] + plot_layout(nrow = 1, guides = "collect")
row3 <- plot_list[[15]] + plot_list[[13]] + plot_list[[12]] + plot_layout(nrow = 1, guides = "collect")
row4 <- plot_list[[5]] + plot_list[[4]] + plot_spacer() + plot_layout(nrow = 1, widths = c(1, 1, 1.14))
# Combine all rows
final_plot <- (row1 / row2 / row3 / row4) +
plot_layout(
guides = "collect",
widths = c(1, 1, 1) # Adjust the widths to control the layout of the first three columns
) &
theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 0), # Set equal margins
panel.spacing = unit(0.5, "cm") # Ensure consistent spacing between plots
)
final_plot <- final_plot +
plot_annotation(tag_levels = 'a') &
theme(plot.tag = element_text(size = 20, face = "bold"))
ggsave("~/work/PAR/results/plots/Figure5_combined.pdf", final_plot, width = 18, height = 16)
plot_list[1] # AcrSch
plot_list[2] # SylBra
plot_list[3] # AlaArv
plot_list[4] # EreAlp
plot_list[5] # PanBia
plot_list[6] # TaeGut
plot_list[7] # FicAlb
plot_list[8] # AegCau
plot_list[9] # CetCet
plot_list[10] # CisJun
plot_list[11] # PhyCol
plot_list[12] # LocLus
plot_list[13] # CecDau
plot_list[14] # SylAtr
plot_list[15] # ArgAlt
plot_list[16] # PycBar
plot_list[[16]] <- plot_list[[16]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[14]] <- plot_list[[14]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank()) # Hide the label
plot_list[[10]] <- plot_list[[10]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank()) # Hide the label
plot_list[[2]] <- plot_list[[2]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[3]] <- plot_list[[3]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank()) # Hide the label
# Define each row
Srow1 <- plot_list[[16]] + plot_list[[14]] + plot_list[[10]] + plot_layout(nrow = 1, guides = "collect", tag_level = "keep")
Srow2 <- plot_list[[2]] + plot_list[[3]] + plot_spacer() + plot_layout(nrow = 1, widths = c(1, 1, 1.14))
# Combine all rows
final_plot <- (Srow1 / Srow2) +
plot_layout(
guides = "collect",
widths = c(1, 1, 1) # Adjust the widths to control the layout of the first three columns
) &
theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 0), # Set equal margins
panel.spacing = unit(0.5, "cm") # Ensure consistent spacing between plots
)
final_plot <- final_plot +
plot_annotation(tag_levels = 'a') &
theme(plot.tag = element_text(size = 20, face = "bold"))
ggsave("~/work/PAR/results/plots/FigureS12_combined.pdf", final_plot, width = 18, height = 8)
setwd("~/work/PAR/results/plots")
pdf("pairwise_distance_spearman.allSp.pdf", width= 14, height = 12)
ggscatter(sp.comp.dist.data.pos, x = "chr_start", y = "value", fill = "Species1",
size = 3, shape = 21, facet.by = "species_name", combine = T,xlab = "Z chromosome position",
add = "reg.line", conf.int = F,ylab = "pairwise distance", repel = T,
add.params = list(color = "Species1", fill = "grey", size = 0.75),
title = NULL, show.legend.text = FALSE,
cor.coef = TRUE,
scales = "free_y",
cor.coeff.args = list(method = "spearman", cor.coef.name = "rho"))
dev.off()
embed_fonts("pairwise_distance_spearman.allSp.pdf", outfile="pairwise_distance_spearman.allSp_font.pdf")
# =======================================
# Private allele stats and plots
# =======================================
Z.singletons=read.table("~/work/sylvietta_PAR/results/PAR_FicAlb/allSp.Z.genes.singletons.S.sum.krakenSNPs.out",header=FALSE,fill=TRUE,stringsAsFactor=FALSE)
Z.singletons <- plyr::rename(Z.singletons, c("V1"="nr.singletons", "V2"="Gene","V3"="indv", "V4"="species", "V5"="sex"))
Z.singletons <- subset(Z.singletons, c(species != "AlaRaz" & species != "GalMod"))
Z.singletons$species <- sub("HirDau", "CecDau", Z.singletons$species)
Z.singletons$species <- sub("TurAlt", "ArgAlt", Z.singletons$species)
Z.singletons$species <- sub("\\_1EV02922", "", Z.singletons$species)
specieslist <- unique(Z.singletons$species)
samples.sex$species <- sub("HirDau", "CecDau", samples.sex$species)
samples.sex$species <- sub("TurAlt", "ArgAlt", samples.sex$species)
samples.sex$species <- sub("\\_1EV02922", "", samples.sex$species)
samples.sex.15 <- samples.sex[ samples.sex$species %in% specieslist, ]
samples.Z.gene.combination <- crossing(ZF.Z.genes.startPos, samples.sex.15)
Z.singletons <- merge(Z.singletons, samples.Z.gene.combination, all=TRUE, by=c("Gene", "sex", "indv", "species"))
Z.singletons[is.na(Z.singletons)] <- 0
PAR.genes.bed.small2 <- unique(PAR.genes.bed %>% select(Gene, name, chr_start))
PAR.genes.bed.small2 <- aggregate(chr_start ~ name + Gene, PAR.genes.bed.small2, function(x) min(x))
PAR.genes.bed.small2 <- PAR.genes.bed.small2[order(PAR.genes.bed.small2$chr_start),]
PAR.genes.bed.small2$order <- seq.int(nrow(PAR.genes.bed.small2))
Z.singletons.PAR <- merge(Z.singletons, PAR.genes.bed.small2, by=c("Gene"))
Z.singletons.PAR$species <- ordered(Z.singletons.PAR$species,
levels = c("TaeGut", "FicAlb", "CetCet", "AegCau", "PhyCol", "PycBar", "SylAtr", "ArgAlt", "CecDau", "LocLus", "AcrSch", "CisJun",
"SylBra", "PanBia", "EreAlp", "AlaArv"))
sc.genes2 <- unique(sp.comp.dist.data.pos.symbol$Gene)
Z.singletons.PAR <- Z.singletons.PAR[ Z.singletons.PAR$name %in% sc.genes2, ]
pdf("~/work/PAR/results/plots/PAR_heatmap_singletons_per_sex.pdf", height = 10, width =10)
ggplot(Z.singletons.PAR, aes(reorder(name, order),species, fill= nr.singletons)) +
geom_tile( colour = "black", size = 0.5) + scale_fill_viridis(direction = -1, option = "viridis") +
theme_tufte(base_family="Helvetica", base_size = 12) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(axis.text.x=element_text(size=18, angle=90), axis.text.y=element_text(size=18)) + labs(title="Per sex number of singletons", x="", y="", fill="") + facet_wrap(~sex, ncol = 1)
dev.off()
Z.singletons.PAR_wide <- dcast(Z.singletons.PAR, species + Gene + Trans + order + name ~ sex, value.var="nr.singletons")
pdf("~/work/PAR/results/PAR_heatmap_singletons_sex_diff.pdf", height = 10, width =10)
ggplot(Z.singletons.PAR_wide, aes(reorder(name, order),species, fill= female-male)) +
geom_tile( colour = "black", size = 0.5) + scale_fill_viridis(direction = -1, option = "viridis") +
theme_tufte(base_size = 18, base_family = "serif")+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(axis.text.x=element_text(size=18, angle=90), axis.text.y=element_text(size=18)) + labs(title="Female-male difference in singletons", x="", y="", fill="")
dev.off()
### Wilcoxon tests for singletons
results <- Z.singletons.PAR_wide %>%
group_by(species) %>%
summarise(
p_value = if (n() > 1) {
wilcox.test(female, male, paired = TRUE)$p.value
} else {
NA # Not enough data for the test
},
female_higher = mean(female > male) # Proportion of cases where female > male
)
results <- Z.singletons.PAR_wide %>%
group_by(species) %>%
summarise(
median_female = median(female, na.rm = TRUE), # Mean value for females
median_male = median(male, na.rm = TRUE), # Mean value for males
n_male = sum(!is.na(male)),
sd_male = sd(male, na.rm = TRUE),
iqr_male = IQR(male, na.rm = TRUE),
n_female = sum(!is.na(female)),
sd_female = sd(female, na.rm = TRUE),
iqr_female = IQR(female, na.rm = TRUE),
p_value = if (n() > 1) {
wilcox.test(female, male, paired = TRUE)$p.value
} else {
NA # Not enough data for the test
},
W_statistic = if (n() > 1) {
wilcox.test(female, male, paired = TRUE)$statistic
} else {
NA
},
female_higher = mean(female > male, na.rm = TRUE) # Proportion of cases where female > male
) %>%
mutate(
bonferroni_p = p.adjust(p_value, method = "bonferroni")
)
outname <- sprintf("~/work/PAR/results/tables/singleton_wilcoxon.tsv")
write.table(results, file = outname, sep = "\t", quote = FALSE, row.names = F)
# =======================================
# Read depth stats and plots
# =======================================
# Depth stats
Z.depth=read.table("~/work/sylvietta_PAR/results/datasets/allSp.Z.genes.bamstat04.geneInfo.out",header=FALSE,fill=TRUE,stringsAsFactor=FALSE)
Z.depth <- plyr::rename(Z.depth, c("V1"="indv", "V2"="min.depth","V3"="max.depth","V4"="avg.depth", "V5"="median.depth","V6"="species","V7"="Gene","V8"="Trans"))
Z.depth <- subset(Z.depth, c(Z.depth$species!="AlaRaz" & Z.depth$species!="GalMod"))
Z.depth$species <- sub("HirDau", "CecDau", Z.depth$species)
Z.depth$species <- sub("TurAlt", "ArgAlt", Z.depth$species)
Z.depth$species <- sub("\\_1EV02922", "", Z.depth$species)
Z.depth.mean <- aggregate(avg.depth ~ species + Gene + indv + Trans + indv, Z.depth, mean)
Z.depth.mean.combination <- merge(Z.depth.mean, samples.Z.gene.combination)
Z.depth.mean.combination_wide <- dcast(Z.depth.mean.combination, species + Gene + Trans + chr_start ~ sex, value.var="avg.depth")
Z.depth.mean.combination_wide <- subset(Z.depth.mean.combination_wide, c(Z.depth.mean.combination_wide$female>5 & Z.depth.mean.combination_wide$female<80))
Z.depth.mean.combination_wide <- subset(Z.depth.mean.combination_wide, c(Z.depth.mean.combination_wide$male>5 & Z.depth.mean.combination_wide$male<80))
aggregate(male ~ species, data = Z.depth.mean.combination_wide, mean)
### Calculate coverage ratio and remove outliers
Z.depth.mean.combination_wide$cov_ratio <- Z.depth.mean.combination_wide$female/Z.depth.mean.combination_wide$male
Z.depth.mean.combination_wide <- subset(Z.depth.mean.combination_wide, c(Z.depth.mean.combination_wide$cov_ratio < 2.5))
Z.depth.mean.combination_wide.median <- aggregate(cov_ratio ~ species, data = Z.depth.mean.combination_wide, median)
Z.depth.mean.combination_wide.median <- plyr::rename(Z.depth.mean.combination_wide.median, c("cov_ratio"="species_median"))
Z.depth.mean.combination_wide <- merge(Z.depth.mean.combination_wide, Z.depth.mean.combination_wide.median, by=c("species"))
Z.depth.mean.combination_wide$cov_ratio_scaled <- Z.depth.mean.combination_wide$cov_ratio / Z.depth.mean.combination_wide$species_median
Z.depth.mean.combination_wide <- na.omit(Z.depth.mean.combination_wide)
Z.depth.mean.combination_wide$cov_ratio_scaled <- Z.depth.mean.combination_wide$cov_ratio_scaled - 0.5
Z.depth.mean.combination_wide.PAR <- merge(Z.depth.mean.combination_wide, PAR.genes.bed.small2, by=c("Gene"))
Z.depth.mean.combination_wide.PAR <- Z.depth.mean.combination_wide.PAR[ Z.depth.mean.combination_wide.PAR$name %in% sc.genes2, ]
#Z.depth.mean.combination_wide.PAR$species <- sub("HirDau", "CecDau", Z.depth.mean.combination_wide.PAR$species)
Z.depth.mean.combination_wide$species_name <- "null"
Z.depth.mean.combination_wide <- Z.depth.mean.combination_wide %>%
mutate(species_name = recode(species,
"TaeGut" = "Taeniopygia guttata (Group I: Z)",
"AlaArv" = "Alauda arvensis (Group VII: Z;4A,3,5)*",
"EreAlp" = "Eremophila alpestris (Group VI: Z;4A,3)",
"PanBia" = "Panurus biarmicus (Group V: Z;4A;3;5)",
"SylBra" = "Sylvietta brachyura (Group IV: Z;4A;8)*",
"AegCau" = "Aegithalos caudatus (Group II: Z;4A)",
"AcrSch" = "Acrocephalus schoenobaenus (Group II: Z;4A)",
"CetCet" = "Cettia cetti (Group II: Z;4A)",
"CisJun" = "Cisticola juncidis (Group III: Z;4A;4)*",
"PhyCol" = "Phylloscopus collybita (Group II: Z;4A)",
"CecDau" = "Cecropis daurica (Group II: Z;4A)",
"FicAlb" = "Ficedula albicollis (Group I: Z)",
"PycBar" = "Pycnonotus barbatus (Group II: Z;4A)",
"SylAtr" = "Sylvia atricapilla (Group II: Z;4A)",
"LocLus" = "Locustella luscinioides (Group II: Z;4A)",
"ArgAlt" = "Argya altirostris (Group II: Z;4A)"
))
options(scipen=999)
pdf("~/work/PAR/results/plots/FigureS1_PAR_genes_depth.pdf", height = 10, width =15)
ggplot(subset(Z.depth.mean.combination_wide, chr_start<10000000), aes(x = chr_start/1000000, y = cov_ratio_scaled, fill = species_name)) +
geom_rect(mapping=aes(xmin=0, xmax=0.450000, ymin=-Inf, ymax=Inf), fill="grey" , alpha=0.5) +
geom_point(pch = 21) + facet_wrap(~species_name) + theme_minimal() +geom_hline(yintercept = 0.5) +
labs(x = "Z chromosome position (Mb)", y = "Female-to-male genome coverage ratio") +
theme_bw(base_family="Helvetica", base_size = 12) + theme(legend.position="none")
dev.off()
pdf("~/work/PAR/results/plots/PAR_genes_depth.female.pdf", height = 10, width =15)
ggplot(subset(Z.depth.mean.combination_wide, chr_start<10000000), aes(x = chr_start, y = female, fill = species)) +
geom_rect(mapping=aes(xmin=0, xmax=450000, ymin=-Inf, ymax=Inf), fill="grey" , alpha=0.5) +
geom_point(pch = 21) + facet_wrap(~species) + theme_minimal() +geom_hline(yintercept = 0.5)
dev.off()
pdf("~/work/PAR/results/plots/PAR_genes_depth.male.pdf", height = 10, width =15)
ggplot(subset(Z.depth.mean.combination_wide, chr_start<10000000), aes(x = chr_start, y = male, fill = species)) +
geom_rect(mapping=aes(xmin=0, xmax=450000, ymin=-Inf, ymax=Inf), fill="grey" , alpha=0.5) +
geom_point(pch = 21) + facet_wrap(~species) + theme_minimal() +geom_hline(yintercept = 0.5)
dev.off()